Version 2.0 - I’m adapting the originally published code so as to present a more coherent tutorial for the research presented in the paper “A Bayesian approach to the quantification of extremal responses in dynamic structures”, currently under review in Ocean Engineering.

First, a synthetic dataset is generated, then methodology is demonstrated, and finally the diagnostics are presented.

Generation of synthetic dataset

As the original dataset used within the research is confidential, the methodologies presented are demonstrated herein using a synthetic dataset. The dataset is generated so as to be similar to the original data.

First, we specify the number of training and validation points, and dimensions. As to align with the original research, this tutorial is presented for a single input dimension. Demonstration of the methodology with multivariate data is forthcoming.

# Number of training points used to fit the models.
N <- 80
# Number of held-out points used for validation/prediction.
N_predict <- 450
N_total <- N + N_predict
# Input dimensionality; this tutorial is restricted to a single dimension.
# NOTE(review): `dim` shadows base::dim() as a variable name; calls such as
# dim(x) still resolve to the function, but a rename would be safer.
dim <- 1

Now, we generate a Latin hypercube spacing over the inputs, randomly sample the training points, and assign the remainder to act as validation points.

# Maximin Latin-hypercube design over [0, 1]^dim for all inputs
# (training + validation).
x_lhs <- maximinLHS(N_total, dim)

# Randomly choose N rows as training points.
# seq_len() is the safe idiom (1:N_total misbehaves only if N_total == 0,
# but the idiom is preferred regardless); the sampled values are identical.
sample_index <- sample(seq_len(N_total), N, replace = FALSE)

# NOTE(review): with dim == 1 these extractions drop to plain numeric
# vectors, which the rest of the script relies on; for dim > 1 they would
# need drop = FALSE and a row-bind instead of c().
x_train <- x_lhs[sample_index,]
x_pred <- x_lhs[-sample_index,]

# All inputs, training first, then validation.
x_all <- c(x_train, x_pred)

To generate a representative synthetic dataset, a Gaussian distribution is specified — non-stationary over the inputs — with a constant standard deviation, \(\sigma_d\), and a mean, \(\boldsymbol{\mu}_d\), sampled from a zero-mean Gaussian process with squared-exponential covariance function defined by an amplitude parameter \(\alpha_d\) and length-scale \(\rho_d\). The squared exponential covariance function is coded into the function SE(), defined below.

# Squared-exponential covariance evaluated at distance d.
# NOTE(review): rho enters linearly (it plays the role of a squared
# length-scale), and alpha is not squared — this differs from Stan's
# cov_exp_quad parameterization but matches how the synthetic data are
# generated throughout this tutorial.
SE <- function(d, alpha, rho) {
  scaled_sq_dist <- (d * d) / (2 * rho)
  alpha * exp(-scaled_sq_dist)
}

First, parameters \(\sigma_d\), \(\alpha_d\), and \(\rho_d\) are specified. Then we sample \(\boldsymbol{\mu}_d\) such that \(\boldsymbol{\mu}_d \sim \mathcal{GP}(0,k(x,x'))\) where \(k(x,x')\) represents the squared-exponential covariance function. Finally, 1000 draws of the data, \(y\) are made at each of the inputs x_all, where \(y \mid x \sim \mathcal{N}(\boldsymbol{\mu}_d(x), \sigma_d)\).

# Parameters of the synthetic data-generating process: constant observation
# noise sd, plus SE-kernel amplitude and (squared) length-scale for the
# latent mean function.
sigma_data <- 0.1
alpha_data <- 3
rho_data <- 0.2

# Pairwise distances between all inputs and the implied covariance matrix
# of the latent mean.
dist_matrix <- as.matrix(dist(x_all))
K_data_mean <- SE(dist_matrix, alpha_data, rho_data)

# One draw of the latent mean over all inputs: mu_d ~ GP(0, k(x, x')).
mean_data <- as.vector(rmvnorm(1, rep(0, N_total), K_data_mean))

data_sample_size <- 1000

# Draw 1000 replicates at each input: y | x ~ N(mu_d(x), sigma_d).
# A single vectorized rnorm() call replaces the original column-by-column
# loop: rnorm() recycles mean_data with period N_total and matrix() fills
# column-major, so the draws land in exactly the same (row, column) cells
# and the RNG stream is consumed in the same order.
data_samples <- data.frame(
  matrix(
    rnorm(N_total * data_sample_size, mean_data, sigma_data),
    nrow = N_total, ncol = data_sample_size
  )
)

# Long format for ggplot; overwrite the melt id column with the input
# values (recycled across the 1000 replicates).
data_samples_melt <- melt(data_samples)
data_samples_melt[ ,1] <- x_all

From these data we extract the maxima, empirical estimates of the 99th quantile, and the points exceeding these empirical estimates.

# Per-input summaries across the 1000 replicates.
row_max <- apply(data_samples, 1, max)
data_max <- data.frame(x = x_all, max = row_max)

# Empirical 99th-percentile threshold at each input.
quant_threshold <- 0.99

row_quant <- apply(data_samples, 1, quantile, probs = quant_threshold)
data_quantile <- data.frame(x = x_all, quant = row_quant)

# Row-wise empirical quantile thresholds, computed once up front.
# (The original recomputed the full apply() over all rows inside every
# loop iteration — accidental O(n^2) work.)
row_thresholds <- apply(data_samples, 1, quantile, probs = quant_threshold)

# For each input, collect the replicates exceeding its threshold.
# Accumulate per-input data frames in a pre-allocated list and bind once,
# instead of growing df_exceed with rbind() inside the loop.
exceed_list <- vector("list", N_total)
for (i in seq_len(N_total)) {
  # Logical mask of replicates above the threshold at input i.
  # (The original wrapped this comparison in a spurious subset() call.)
  over_threshold <- data_samples[i, ] > row_thresholds[i]
  data_exceed <- as.numeric(data_samples[i, over_threshold])
  exceed_list[[i]] <- data.frame(
    x = rep(x_all[i], length(data_exceed)),
    exceed = data_exceed
  )
}
df_exceed <- do.call(rbind, exceed_list)

These values are plotted in the figure below. The synthetic data are represented in black, the empirical quantile estimates in green, the threshold exceedance points in blue, and the maxima in red.

# Overlay of the synthetic draws (black), empirical 99th quantiles (green),
# threshold exceedances (blue), and per-input maxima (red).
# NOTE(review): `variable` was overwritten with x_all above, so the x-axis
# shows the input values, not melt's factor levels.
ggplot(data = data_samples_melt, aes(x = variable, y = value)) +
    geom_point(alpha = 0.005) +
    geom_point(data = data_quantile, aes(x = x, y = quant), colour = "green") +
    geom_point(data = df_exceed, aes(x = x, y = exceed), colour = "blue", alpha = 0.05) +
    geom_point(data = data_max, aes(x = x, y = max), colour = "red", alpha = 0.3)

Sampling the GEV model

The stan model used to estimate the GEV model from the “observed” data is given below, and is saved in the object synth_gev. Note that, for the sake of tutorial simplicity, the noise term has been simplified to a constant value. The original stan models, including the log-linear function, are uploaded to the same GitHub repository.

\[\begin{aligned} (z(x_i) \mid \mu(x_i), \sigma_z(x_i), \xi_z) &\sim GEV(\mu(x_i), \sigma_z(x_i), \xi_z \mid \boldsymbol{\theta}, \boldsymbol{\sigma}_{z}) \\ (\boldsymbol{\mu}\mid \boldsymbol{\theta}) &\sim \mathcal{GP}(0, \lambda_1(\cdot,\cdot \mid \boldsymbol{\theta})) \\ p(\xi_z, \boldsymbol{\theta}, \boldsymbol{\sigma}_{z}) &= p(\xi_z)p(\boldsymbol{\theta})p(\boldsymbol{\sigma}_{z}), \end{aligned}\]

\[ p(\boldsymbol{\mu}, \boldsymbol{\sigma}_{z}, \xi_z, \boldsymbol{\theta}\mid \boldsymbol{z}) \propto p(\boldsymbol{z}\mid \boldsymbol{\mu}, \boldsymbol{\sigma}_{z}, \xi_z) \; p(\boldsymbol{\mu}\mid \boldsymbol{\theta}) \; p(\xi_z, \boldsymbol{\theta}, \boldsymbol{\sigma}_{z}).\]


functions {
  // Draw from the GP predictive distribution at inputs x2, conditional on
  // latent values z1 observed at x1, under a squared-exponential kernel
  // with marginal sd `alpha` and length-scale `rho`. `delta` is jitter
  // added to the predictive covariance diagonal for numerical stability.
  vector gp_pred_rng(real[] x2,
                     vector z1, real[] x1,
                     real alpha, real rho, real delta) {
    int N1 = rows(z1);
    int N2 = size(x2);
    vector[N2] f2;
    {
      // Training covariance with fixed 1e-6 jitter, and its Cholesky factor.
      matrix[N1, N1] K =   cov_exp_quad(x1, alpha, rho) + diag_matrix(rep_vector(1e-06, N1));
      matrix[N1, N1] L_K = cholesky_decompose(K);

      // K^{-1} z1 computed via two triangular solves against L_K.
      vector[N1] L_K_div_z1 = mdivide_left_tri_low(L_K, z1);
      vector[N1] K_div_z1 = mdivide_right_tri_low(L_K_div_z1', L_K)';
      // Cross-covariance between training and prediction inputs.
      matrix[N1, N2] k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho);
      // Standard GP conditioning: predictive mean and covariance.
      vector[N2] f2_mu = (k_x1_x2' * K_div_z1);
      matrix[N1, N2] v_pred = mdivide_left_tri_low(L_K, k_x1_x2);
      matrix[N2, N2] cov_f2 =   cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred
                              + diag_matrix(rep_vector(delta, N2));
      f2 = multi_normal_rng(f2_mu, cov_f2);
    }
    return f2;
  }
}

data {
  int<lower=1> N;                  // number of training observations

  real x[N];                       // training inputs
  vector[N] y;                     // observed block maxima at the training inputs

  int<lower=1> N_predict;          // number of prediction points
  real x_predict[N_predict];       // prediction inputs (used only in generated quantities)
}

parameters {
  real<lower=0> rho_mu;            // GP length-scale for the latent GEV location
  real<lower=0> alpha_mu;          // GP marginal sd for the latent GEV location
  real<lower=-0.5,upper=0.5> xi;   // GEV shape, constrained to a plausible range
  real<lower = 0> sigma;           // GEV scale (constant; see tutorial text)
  vector[N] mu_latent;             // latent GEV location at the training inputs
}

model {
  // GP prior covariance for the latent GEV location, with 1e-6 jitter.
  matrix[N, N] cov_mu =   cov_exp_quad(x, alpha_mu, rho_mu) + diag_matrix(rep_vector(1e-06, N));
  matrix[N, N] L_cov_mu = cholesky_decompose(cov_mu);

  real t_x[N];

  // Weakly-informative priors on the GP hyper-parameters and GEV shape.
  rho_mu ~ normal(0, 20.0 / 3);
  alpha_mu ~ normal(0, 2);

  xi ~ normal(0,1);
  // NOTE(review): sigma has no explicit prior, so it receives an implicit
  // flat prior on (0, inf) — confirm this is intended.

  mu_latent ~ multi_normal_cholesky(rep_vector(0, N), L_cov_mu);

  // GEV log-likelihood: log f = -log(sigma) + (xi + 1) log t - t,
  // with t = [1 + xi (y - mu) / sigma]^(-1/xi).
  for (i in 1:N) {
    // Use the Gumbel limit when xi is numerically zero. The original
    // tested exact float equality (xi == 0), which essentially never
    // triggers and leaves the power form numerically unstable for tiny
    // |xi|; this tolerance matches gpareto_1dim_lpdf elsewhere in the
    // project.
    if (fabs(xi) < 1e-15) {
      t_x[i] = exp(-(y[i]-mu_latent[i])/sigma);
      } else {
      t_x[i] = (1 + xi * (y[i] - mu_latent[i]) / sigma)^(-1/xi);
    }
    target += - log(sigma) + (xi + 1) * log(t_x[i]) - (t_x[i]);
  }

}

generated quantities {
  // GP interpolation of the latent GEV location at the prediction inputs,
  // conditioning on the sampled mu_latent values at the training inputs.
  vector[N_predict] mu_predict = gp_pred_rng(x_predict, mu_latent, x, alpha_mu, rho_mu, 1e-10);
}

To estimate the model parameters, we specify the training data in a list, along with the points at which we wish to validate the model. The specification of N_predict and x_predict is not necessary here; it simply allows for the efficient computation of the predictive quantities of \(\boldsymbol{\mu}\) in the generated quantities section of the stan model. We initialise the model at the true parameter values. In practice we do not know these quantities; however, for the purposes of exposition we save ourselves much heartache and start the sampler in the correct posterior location. Finally, traceplots, pairs plots, and posterior predictive quantities are provided. Posterior predictive values are calculated according to

\[ p(z^* \mid \boldsymbol{z}) = \int \int \dots \int p(z^* \mid \mu(x^*), \boldsymbol{\sigma}_{z}, \xi_z) \; p(\mu(x^*) \mid \boldsymbol{\theta}, \boldsymbol{\mu}) \; p(\boldsymbol{\mu}, \boldsymbol{\sigma}_{z}, \xi_z, \boldsymbol{\theta}\mid \boldsymbol{z}) \; d\mu(x^*) \: d\boldsymbol{\mu}\: d\boldsymbol{\sigma}_{z} \: d\xi_z \: d\boldsymbol{\theta}.\]

# Posterior-predictive GEV quantiles.
# For each posterior draw i, evaluate the q-quantile of the GEV at every
# prediction point, using that draw's row of mu_predict together with its
# sigma and xi.
#
# stan_list:     list from rstan::extract() holding mu_predict, sigma, xi
# sample_length: number of posterior draws
# N_predict:     number of prediction points (columns of mu_predict)
# q:             probability at which to evaluate the quantile
# Returns a sample_length x N_predict data frame of quantiles.
quant_gev_stan <- function (stan_list, sample_length, N_predict, q) {
  quant <- data.frame(matrix(NA, nrow = sample_length, ncol = N_predict))
  # seq_len() is safe if sample_length is 0 (1:0 would iterate over c(1, 0)).
  for (i in seq_len(sample_length)) {
    quant[i, ] <- qgev(rep(q, N_predict), as.numeric(stan_list$mu_predict[i, ]), 
                       stan_list$sigma[i], stan_list$xi[i])
    # Lightweight progress indicator (carriage return overwrites in place).
    cat(sprintf('\rIteration %d/%d', i, sample_length))
  }
  return(quant)
}
# Cache compiled Stan models and parallelise across chains.
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

# Training inputs/maxima plus the held-out prediction grid used by the
# generated-quantities GP interpolation.
data <- list(
  N = N,
  x = x_train,
  y = data_max$max[1:N],
  N_predict = N_predict,
  x_predict = x_pred
)

# Initialise at the true data-generating values (see the tutorial text);
# the latent location is started at the observed maxima.
param_list <- list(
  rho_mu = 0.2,
  alpha_mu = 3,
  xi = 0.2,
  sigma = 0.05,
  mu_latent = data_max$max[1:N]
)

# Four chains, all initialised identically.
gev_fit_synth <- stan(
  file = '01a_synth_gev.stan',
  data = data,
  chains = 4,
  seed = 5838298,
  iter = 1250,
  warmup = 250,
  init = rep(list(param_list), 4),
  control = list(adapt_delta = 0.99, max_treedepth = 15)
)

traceplot(gev_fit_synth)
pairs(gev_fit_synth, pars = c("rho_mu", "alpha_mu", "xi", "sigma"))

gev_fit_extract <- extract(gev_fit_synth)

# Number of posterior draws. length() is correct whether or not rstan
# attaches a dim attribute to the extracted draws (dim() returns NULL for
# a plain vector, which would break the downstream matrix allocation).
sample_length <- length(gev_fit_extract$xi)

# Posterior-mean predictive quantiles at probabilities 0.5%, 5%, 95%, 99.5%.
qgev_list <- pblapply(
  c(0.005, 0.05, 0.95, 0.995),
  function(q) {
    pbapply(
      quant_gev_stan(gev_fit_extract, sample_length, N_predict, q),
      2,
      mean)
  }
)

# Column labels match the probabilities actually computed above.
# (The original labelled the 0.05/0.95 columns q025/q975, misstating the
# interval plotted below.)
qgev_pred_df <- data.frame(matrix(unlist(qgev_list), nrow = N_predict))
names(qgev_pred_df) <- c("q005", "q05", "q95", "q995")
qgev_pred_df$x <- x_pred

# 90% (dark) and 99% (light) predictive intervals, with all maxima (blue)
# and the training maxima (red) overlaid.
ggplot(data = qgev_pred_df, aes(x = x)) +
  geom_ribbon(aes(ymin = q05, ymax = q95), alpha = 0.3) +
  geom_ribbon(aes(ymin = q005, ymax = q995), alpha = 0.1) +
  geom_point(data = data_max, aes(x = x, y = max), colour = "blue", alpha = 0.3) +
  geom_point(data = data_max[1:N, ], aes(x = x, y = max), colour = "red")
## 'pars' not specified. Showing first 10 parameters by default.

Quantile Regression

The stan model to estimate the quantiles from the observed data is given below. This code models the latent quantiles as draws from a Gaussian process.

functions {
  // Draw from the GP predictive distribution at inputs x2, conditional on
  // latent values z1 observed at x1, under a squared-exponential kernel
  // with marginal sd `alpha` and length-scale `rho`. `delta` is jitter
  // added to the predictive covariance diagonal for numerical stability.
  // (Identical to the helper in 01a_synth_gev.stan.)
  vector gp_pred_rng(real[] x2,
                     vector z1, real[] x1,
                     real alpha, real rho, real delta) {
    int N1 = rows(z1);
    int N2 = size(x2);
    vector[N2] f2;
    {
      // Training covariance with fixed 1e-6 jitter, and its Cholesky factor.
      matrix[N1, N1] K =   cov_exp_quad(x1, alpha, rho) + diag_matrix(rep_vector(1e-06, N1));
      matrix[N1, N1] L_K = cholesky_decompose(K);

      // K^{-1} z1 computed via two triangular solves against L_K.
      vector[N1] L_K_div_z1 = mdivide_left_tri_low(L_K, z1);
      vector[N1] K_div_z1 = mdivide_right_tri_low(L_K_div_z1', L_K)';
      matrix[N1, N2] k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho);
      // Standard GP conditioning: predictive mean and covariance.
      vector[N2] f2_mu = (k_x1_x2' * K_div_z1);
      matrix[N1, N2] v_pred = mdivide_left_tri_low(L_K, k_x1_x2);
      matrix[N2, N2] cov_f2 =   cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred
                              + diag_matrix(rep_vector(delta, N2));
      f2 = multi_normal_rng(f2_mu, cov_f2);
    }
    return f2;
  }
}

data {
  int<lower=1> N;                // number of training points

  real x[N];                     // training inputs
  vector[N] y;                   // empirical quantile estimates at the training inputs

  int<lower=1> N_predict;        // number of prediction points
  real x_predict[N_predict];     // prediction inputs (here: all inputs)
}

parameters {
  real<lower=0> rho;             // GP length-scale
  real<lower=0> alpha;           // GP marginal standard deviation
  // NOTE(review): sigma is added directly to the covariance diagonal below,
  // so it acts as a noise *variance*, not a standard deviation — confirm.
  real<lower=0> sigma;
}

model {

  // Zero-mean GP marginal likelihood for the observed quantiles, with
  // observation noise sigma on the diagonal.
  matrix[N, N] cov =   cov_exp_quad(x, alpha, rho) + diag_matrix(rep_vector(sigma,N));
  matrix[N, N] L_cov = cholesky_decompose(cov);

  rho ~ normal(0, 20.0 / 3);
  alpha ~ normal(0, 2);
  // NOTE(review): sigma has no explicit prior (implicit flat on (0, inf)).
  
  y ~ multi_normal_cholesky(rep_vector(0, N), L_cov);

}

generated quantities {
  // GP interpolation of the latent quantile surface at x_predict,
  // conditioning directly on the observed quantile estimates y.
  vector[N_predict] mu_predict = gp_pred_rng(x_predict, y, x, alpha, rho, 1e-10);
}

The code executing the stan model is provided below. Traceplots, pairs plots and the mean predictive quantiles, \(\mathbb{E}[M_2(\cdot;\boldsymbol{\phi})]\), are shown.

# Quantile-regression data: fit to the N training quantile estimates and
# predict over every input (training + validation).
data_quant <- list(
  N = N,
  x = x_train,
  y = data_quantile$quant[1:N],
  N_predict = N_predict + N,
  x_predict = x_all
)

# Common initial values, reused by every chain.
param_list <- list(
  alpha = 3,
  rho = 0.2,
  sigma = 0.05
)

quant_fit_synth <- stan(
  file = '01b_synth_quant.stan',
  data = data_quant,
  chains = 4,
  seed = 5838298,
  iter = 500,
  warmup = 250,
  init = rep(list(param_list), 4),
  control = list(adapt_delta = 0.99, max_treedepth = 15)
)

traceplot(quant_fit_synth)
pairs(quant_fit_synth, pars = c("rho", "alpha", "sigma"))

quant_fit_extract <- extract(quant_fit_synth)

# Posterior mean of the latent quantile surface at every input.
# colMeans() is the idiomatic (and faster) equivalent of apply(x, 2, mean).
data_quantile$mean <- colMeans(quant_fit_extract$mu_predict)

# Empirical quantile estimates (points) against the fitted posterior-mean
# quantile curve (red line).
ggplot(data = data_quantile, aes(x = x)) + 
  geom_point(aes(y = quant)) +
  geom_line(aes(y = mean), colour = "red")
## 'pars' not specified. Showing first 10 parameters by default.

Threshold Exceedance modelling

Finally, the stan model for the exceedance modelling is presented below. Note that the generalised Pareto distribution does not already exist in stan, and so must be given as a user-defined function. As above, the model is simplified to not consider the non-stationary effects of \(\hat{\boldsymbol{\sigma}}_z\). The original stan model used by the research is available in the GitHub repository. The stan model represents the following mathematics:

\[\begin{aligned} (\textbf{y}_{t,i} \mid u(x_i), \tilde{\sigma}_z(x_i), \xi_y) &\sim GP(u(x_i), \tilde{\sigma}_z(x_i), \xi_y \mid \boldsymbol{\phi}, \hat{\boldsymbol{\sigma}}_{z}) \\ (u(\cdot) \mid \tilde{Q}_\alpha(\textbf{x}), \boldsymbol{\phi}) &\sim \mathcal{GP}(0, \lambda_2(\cdot,\cdot \mid \boldsymbol{\phi})) \\ p(\xi_y, \boldsymbol{\phi}, \hat{\boldsymbol{\sigma}}_{z}) &= p(\xi_y)p(\boldsymbol{\phi})p(\hat{\boldsymbol{\sigma}}_{z}), \end{aligned}\]

\[\begin{equation*} p(\textbf{u}, \hat{\boldsymbol{\sigma}}_{z}, \xi_y, \boldsymbol{\phi}\mid \boldsymbol{Y}_t, \tilde{Q}_\alpha(\textbf{x})) \propto p(\boldsymbol{Y}_t \mid \textbf{u}, \hat{\boldsymbol{\sigma}}_{z}, \xi_y) \; p(\textbf{u}\mid \tilde{Q}_\alpha(\textbf{x}), \boldsymbol{\phi}) \; p(\xi_y, \boldsymbol{\phi}, \hat{\boldsymbol{\sigma}}_{z}). \end{equation*}\]

functions {
    // Generalised Pareto log density for a single observation y with
    // threshold ymin, shape xi, and scale sigma.
    real gpareto_1dim_lpdf(real y, real ymin, real xi, real sigma) {
    // generalised Pareto log pdf 
    // int N = rows(y);
    // NOTE(review): inv(xi) is evaluated before the xi-near-zero branch;
    // for |xi| <= 1e-15 the (infinite) inv_xi is only used in the support
    // check, which a negative xi cannot reach — confirm this is benign.
    real inv_xi = inv(xi);
    // Reject draws outside the GPD support when xi < 0 (upper bound
    // ymin - sigma/xi). NOTE(review): there is no corresponding check that
    // y >= ymin; values below the threshold would yield an invalid density
    // rather than a rejection — verify callers guarantee y >= ymin.
    if (xi<0 && (y-ymin)/sigma > -inv_xi)
      reject("xi<0 and max(y-ymin)/sigma > -1/xi; found xi, sigma =", xi, sigma)
    if (sigma<=0)
      reject("sigma<=0; found sigma =", sigma)
    // General form for xi away from zero; exponential limit otherwise.
    if (fabs(xi) > 1e-15)
      return - (1+inv_xi) * (log1p((y-ymin) * (xi/sigma))) - log(sigma);
    else
      return - (y-ymin)/sigma - log(sigma); // limit xi->0
  }
}

data {
  int<lower=1> N;   // number of exceedance observations

  real x[N];        // inputs (unused in this simplified, stationary model)
  vector[N] y;      // observed values exceeding the threshold
  vector[N] u;      // threshold (estimated quantile) for each observation

}

parameters {
  real<lower=-0.5,upper=0.5> xi;  // GPD shape, constrained to a plausible range
  real<lower=0> sigma_hat;        // GPD scale (constant; see tutorial text)
}


model {
  // Weakly-informative priors (half-normal for sigma_hat via the
  // lower bound).
  xi ~ normal(0,1);
  sigma_hat ~ normal(0,1);

  // Each exceedance follows a generalised Pareto distribution with its own
  // threshold u[i] and a common shape and scale.
  for (i in 1:N) {
    y[i] ~ gpareto_1dim(u[i], xi, sigma_hat);
  }

}

Finally, the code to execute the stan code and estimate the exceedance model is provided below. Figures depict the exceedance data given the mean quantile estimate, traceplots, pairs plots, and finally the posterior predictive quantities calculated according to

\[ p(y_t^* \mid \boldsymbol{Y}_t) = \int \int \dots \int p(y_t^* \mid u(x^*), \hat{\boldsymbol{\sigma}}_{z}, \xi_y) \; p(u(x^*) \mid \textbf{u}, \boldsymbol{\phi}) \; p(\textbf{u}, \hat{\boldsymbol{\sigma}}_{z}, \xi_y, \boldsymbol{\phi}\mid \boldsymbol{Y}_t, \tilde{Q}_\alpha(\textbf{x})) \; du(x^*) \: d\textbf{u}\: d\hat{\boldsymbol{\sigma}}_{z} \: d\xi_y \: d\boldsymbol{\phi}.\]

# Posterior-predictive generalised Pareto quantiles.
# For each posterior draw i, evaluate the q-quantile of the GPD above the
# thresholds u at every prediction point.
#
# stan_list:     list from rstan::extract() holding sigma_hat and xi
# u:             vector of thresholds (length N_predict)
# sample_length: number of posterior draws
# N_predict:     number of prediction points
# q:             probability at which to evaluate the quantile
# Returns a sample_length x N_predict data frame of quantiles.
quant_gpd_stan <- function (stan_list, u, sample_length, N_predict, q) {
  quant <- data.frame(matrix(NA, nrow = sample_length, ncol = N_predict))
  # seq_len() is safe if sample_length is 0 (1:0 would iterate over c(1, 0)).
  for (i in seq_len(sample_length)) {
    quant[i, ] <- qgpd(
        rep(q, N_predict), 
        u, 
        stan_list$sigma_hat[i], 
        stan_list$xi[i]
      )
    # Lightweight progress indicator (carriage return overwrites in place).
    cat(sprintf('\rIteration %d/%d', i, sample_length))
  }
  return(quant)
}
# Re-extract exceedances, this time above the model-estimated quantile
# (posterior mean) rather than the empirical quantile.
# Per-input data frames are collected in a pre-allocated list and bound
# once, instead of growing df_exceed_quant_est with rbind() in the loop.
# (The original also seeded the accumulator with columns (x, exceed) that
# do not match the (x, quant, exceed) frames appended to it, which rbind()
# can reject.)
exceed_est_list <- vector("list", N_total)

for (i in seq_len(N_total)) {

  # Logical mask of replicates above the estimated threshold at input i.
  # (The original wrapped this comparison in a spurious subset() call.)
  over_threshold <- data_samples[i, ] > data_quantile$mean[i]

  data_exceed <- as.numeric(data_samples[i, over_threshold])

  exceed_est_list[[i]] <- data.frame(
      x = rep(x_all[i], length(data_exceed)), 
      quant = rep(data_quantile$mean[i], length(data_exceed)),
      exceed = data_exceed
    )
}

df_exceed_quant_est <- do.call(rbind, exceed_est_list)

# Exceedances over the *estimated* quantile (blue), the posterior-mean
# quantile curve (red line), and the per-input maxima (red points).
ggplot() +
  geom_point(data = df_exceed_quant_est, aes(x = x, y = exceed), colour = "blue", alpha = 0.1) +
  geom_line(data = data_quantile, aes(x = x, y = mean), colour = "red") +
  geom_point(data = data_max, aes(x = x, y = max), colour = "red", alpha = 0.2)

# Indices of exceedance rows whose input is one of the training inputs.
# Built per training point — preserving the original row ordering — without
# growing a vector with c() inside a loop.
# NOTE(review): exact `==` on doubles is safe here only because the x
# values in df_exceed_quant_est were copied verbatim from x_all.
train_index <- unlist(lapply(
  seq_len(N),
  function(i) which(x_train[i] == df_exceed_quant_est$x)
))

N_exceed_train <- length(train_index)
N_exceed_total <- nrow(df_exceed_quant_est)
N_exceed_predict <- N_exceed_total - N_exceed_train

# Training data for the generalised Pareto exceedance model: inputs x,
# thresholds u, and exceedance values y at the training inputs.
data_exceed <- list(N = N_exceed_train,
    x = df_exceed_quant_est$x[train_index],
    u = df_exceed_quant_est$quant[train_index],
    y = df_exceed_quant_est$exceed[train_index]
  )

# Initial values shared by all four chains.
param_list <- list(
  xi = 0.01,
  sigma_hat = 0.05
)

exceed_fit_synth <- stan(
  file = '01c_synth_gpd.stan',
  data = data_exceed,
  chains = 4,
  seed = 5838298,
  iter = 500,
  warmup = 250,
  init = rep(list(param_list), 4),
  control = list(adapt_delta = 0.99, max_treedepth = 15)
)

traceplot(exceed_fit_synth)
pairs(exceed_fit_synth)

exceed_fit_extract <- extract(exceed_fit_synth)

# Number of posterior draws. length() is correct whether or not rstan
# attaches a dim attribute to the extracted draws (dim() returns NULL for
# a plain vector, which would break the downstream matrix allocation).
sample_length <- length(exceed_fit_extract$xi)

# Posterior-mean GPD quantiles at probabilities 0.5%, 5%, 95%, 99.5%,
# evaluated above the estimated threshold at every exceedance location.
qgpd_list <- pblapply(
  c(0.005, 0.05, 0.95, 0.995),
  function(q) {
    pbapply(
      quant_gpd_stan(
        exceed_fit_extract, 
        df_exceed_quant_est$quant, 
        sample_length, 
        N_exceed_total, 
        q),
      2, 
      mean)
  }
)

# Column labels match the probabilities actually computed above.
# (The original labelled the 0.05/0.95 columns q025/q975, misstating the
# interval plotted below.)
qgpd_pred_df <- data.frame(matrix(unlist(qgpd_list), nrow = N_exceed_total))
names(qgpd_pred_df) <- c("q005", "q05", "q95", "q995")
qgpd_pred_df$x <- df_exceed_quant_est$x

# 90% (dark) and 99% (light) predictive intervals, with exceedance data
# and the estimated quantile curve overlaid.
ggplot(data = qgpd_pred_df, aes(x = x)) +
  geom_ribbon(aes(ymin = q05, ymax = q95), alpha = 0.3) +
  geom_ribbon(aes(ymin = q005, ymax = q995), alpha = 0.1) +
  geom_point(data = df_exceed_quant_est, aes(x = x, y = exceed), colour = "blue", alpha = 0.1) +
  geom_line(data = data_quantile, aes(x = x, y = mean), colour = "red")